Exploratory data analysis of Spotify data about the albums and songs of Jackson do Pandeiro. The original data come from this repository. An explanation of how the data were generated is available in the Spotify API documentation.
In the analysis below we could:
# Load the track-level Spotify data. Any column not listed explicitly in
# `col_types` is parsed as a double.
data <- read_csv(
  here::here("data/jackson.csv"),
  col_types = cols(
    .default = col_double(),
    # text columns
    album_uri = col_character(),
    album_name = col_character(),
    album_img = col_character(),
    album_release_date = col_character(),
    track_name = col_character(),
    track_uri = col_character(),
    key = col_character(),
    mode = col_character(),
    key_mode = col_character(),
    # NOTE(review): despite its name this column is parsed as a full date
    album_release_year = col_date(format = ""),
    # integer columns
    album_popularity = col_integer(),
    time_signature = col_integer(),
    track_popularity = col_integer()
  )
) %>%
  # Any album name containing "1954" is replaced, as a whole, by one short
  # title.
  mutate(
    album_name = gsub(
      ".*(1954).*",
      "The Music of Brazil/Jackson do Pandeiro",
      album_name
    )
  )

# Quick structural overview of the loaded table.
glimpse(data)
Observations: 500
Variables: 23
$ album_uri <chr> "5T9tTjPIfjbUJGRJdYOOLl", "5T9tTjPIfjbUJGRJdYOOLl", "5T9tTjPIfjbUJGRJdYOOLl", "5T9tTjPIfjbUJGRJdYOOLl", ...
$ album_name <chr> "Jackson Do Pandeiro Volume 1: Tum, Tum, Tum!", "Jackson Do Pandeiro Volume 1: Tum, Tum, Tum!", "Jackson...
$ album_img <chr> "https://i.scdn.co/image/5dcc4a0cad740f1ee0774196d0a14f3693ef8879", "https://i.scdn.co/image/5dcc4a0cad7...
$ album_release_date <chr> "1958-11-11", "1958-11-11", "1958-11-11", "1958-11-11", "1958-11-11", "1958-11-11", "1958-11-11", "1958-...
$ album_release_year <date> 1958-11-11, 1958-11-11, 1958-11-11, 1958-11-11, 1958-11-11, 1958-11-11, 1958-11-11, 1958-11-11, 1958-11...
$ album_popularity <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 12, 12, 12, 12,...
$ track_name <chr> "Tum, Tum, Tum", "Pacífico Pacato", "Nortista Quatrocentão", "Sem Querer", "Vou Sambar", "Boi da Cara Pr...
$ track_uri <chr> "6cCYhV6fU68uzbjWPG9V7x", "6Gu7y9SgtVTGh8YGhDPtCe", "1hq7M7cJtvDgQbQsgVuUAY", "7LbixXwP54FZ3CIYdT6SU6", ...
$ danceability <dbl> 0.501, 0.663, 0.550, 0.447, 0.544, 0.571, 0.495, 0.572, 0.500, 0.579, 0.563, 0.568, 0.541, 0.565, 0.290,...
$ energy <dbl> 0.987, 0.962, 0.947, 0.969, 0.972, 0.926, 0.967, 0.986, 0.947, 0.984, 0.985, 0.927, 0.980, 0.994, 0.934,...
$ key <chr> "A", "F", "D", "G", "E", "F", "E", "C", "F", "A#", "E", "F", "D#", "D", "F", "G", "F", "F", "E", "C", "G...
$ loudness <dbl> 2.561, 1.137, 1.621, 2.743, 2.513, 2.414, 2.375, 2.597, 3.078, 3.070, 2.919, 2.268, 2.470, 3.498, 1.640,...
$ mode <chr> "major", "major", "major", "major", "minor", "major", "minor", "major", "major", "major", "major", "majo...
$ speechiness <dbl> 0.0429, 0.1810, 0.0469, 0.0549, 0.0502, 0.0344, 0.0576, 0.0367, 0.0418, 0.0386, 0.0547, 0.0481, 0.0836, ...
$ acousticness <dbl> 0.718, 0.738, 0.666, 0.759, 0.787, 0.651, 0.712, 0.194, 0.286, 0.312, 0.233, 0.860, 0.543, 0.287, 0.809,...
$ instrumentalness <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
$ liveness <dbl> 0.2820, 0.2000, 0.2510, 0.3330, 0.1760, 0.3420, 0.3210, 0.3010, 0.3230, 0.1340, 0.1190, 0.3190, 0.3040, ...
$ valence <dbl> 0.963, 0.961, 0.923, 0.899, 0.783, 0.961, 0.755, 0.989, 0.957, 0.979, 0.963, 0.896, 0.972, 0.981, 0.597,...
$ tempo <dbl> 101.676, 113.562, 116.125, 116.023, 112.863, 133.065, 117.822, 102.425, 110.203, 98.414, 95.766, 136.613...
$ duration_ms <dbl> 158133, 139773, 163173, 143733, 151653, 157480, 158133, 154680, 184120, 167200, 169720, 161733, 170507, ...
$ time_signature <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,...
$ key_mode <chr> "A major", "F major", "D major", "G major", "E minor", "F major", "E minor", "C major", "F major", "A# m...
$ track_popularity <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 4, 3, 4, 15, 13, ...
# Danceability: normal Q-Q plot followed by an interactive histogram.
ggplot(data, aes(sample = danceability)) +
  stat_qq()

hchart(
  data$danceability,
  color = "#B71C1C",
  name = "Danceability"
)

# Speechiness: same pair of views.
ggplot(data, aes(sample = speechiness)) +
  stat_qq()

hchart(
  data$speechiness,
  color = "#B71C1C",
  name = "Speechiness"
)
# Derive the track duration in seconds from the millisecond column.
data <- mutate(data, duration_s = duration_ms / 1000)

# Inspect only the newly created column.
glimpse(select(data, duration_s))
Observations: 500
Variables: 1
$ duration_s <dbl> 158.133, 139.773, 163.173, 143.733, 151.653, 157.480, 158.133, 154.680, 184.120, 167.200, 169.720, 161.733, 170....
# Track duration (seconds): normal Q-Q plot followed by an interactive
# histogram.
ggplot(data, aes(sample = duration_s)) +
  stat_qq()

hchart(
  data$duration_s,
  color = "#B71C1C",
  name = "Duration (s)"
)
# Flag every track whose album was released after 1981-12-30 as belonging to
# a remastered/re-released album.
# `album_release_date` is a character column holding "YYYY-MM-DD" strings, so
# a plain string comparison orders dates chronologically -- but only when the
# cutoff literal uses the same year-month-day layout. The previous literal
# ("1981-30-12") had day and month swapped, which misplaced dates late in
# 1981.
data <- data %>%
  mutate(remastered = album_release_date > "1981-12-30")

# Spot-check the flag on a random sample of rows.
data %>%
  select(album_name, album_release_year, remastered) %>%
  sample_n(10)
# One row per album, with the logical flag turned into a readable label.
temp <- data %>%
  distinct(album_name, .keep_all = TRUE) %>%
  mutate(remastered = ifelse(remastered, "remastered", "original"))

# Chart of how many albums fall under each label.
hchart(
  temp$remastered,
  colorByPoint = TRUE,
  name = "Álbum"
)
# Count original and remastered albums for each value of
# `album_release_year`, using one row per album.
temp <- data %>%
  distinct(album_name, .keep_all = TRUE) %>%
  group_by(album_release_year) %>%
  summarise(
    original_n = sum(!remastered),
    remastered_n = sum(remastered)
  )

# Side-by-side columns: originals vs. remastered albums per release value.
highchart() %>%
  hc_xAxis(categories = temp$album_release_year) %>%
  hc_add_series(
    temp$original_n,
    type = "column",
    color = "#B71C1C",
    name = "Not remastered"
  ) %>%
  hc_add_series(
    temp$remastered_n,
    type = "column",
    name = "Remastered"
  ) %>%
  hc_title(text = "Number of albums per year")
# Scatter of danceability against speechiness, one point per distinct track
# name; semi-transparent points make overlaps visible.
p <- data %>%
  distinct(track_name, .keep_all = TRUE) %>%
  ggplot(aes(speechiness, danceability)) +
  geom_point(alpha = 0.4)

# Interactive version of the same plot.
ggplotly(p)
There doesn’t seem to be a clear relationship between the danceability of Jackson’s songs and their speechiness. Many tracks span different levels of danceability while sitting at the same low level of speechiness. As speechiness increases (further to the right on the x axis), danceability stays relatively stable (the increase in danceability is marginal).
# 2D density of speechiness vs. danceability (one row per distinct track
# name), drawn as filled contour polygons. The x axis gets a break every 0.01.
data %>%
  distinct(track_name, .keep_all = TRUE) %>%
  ggplot(aes(x = speechiness, y = danceability)) +
  stat_density2d(
    aes(fill = ..level..),
    geom = "polygon"
  ) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.01))
It’s possible to see more clearly that the point of highest density (the area with the highest level) is around 0.7 of danceability and 0.04 of speechiness (a very low level of speechiness). This suggests something we expected: Jackson doesn’t need to talk much to make his songs highly danceable; it’s for good reason that he was nicknamed the king of rhythm.
# Left and bottom margins handed to the plotly layout below.
m <- list(l = 70, b = 150)

# Box plot of track duration (seconds) for each album release value, coloured
# by the remastered flag. X labels are rotated so they do not overlap.
p <- data %>%
  ggplot(aes(
    x = as.factor(album_release_year),
    y = duration_s,
    group = album_release_year,
    color = remastered
  )) +
  geom_boxplot(position = "dodge", alpha = 0.6) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(x = "Album release year", y = "Track duration (s)")

# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned.
ggplotly(p) %>%
  layout(autosize = FALSE, margin = m)
Olhando para os álbuns originais (remastered = FALSE) é possível perceber um grande aumento no tempo de duração das músicas no álbum em 1967 seguido de uma diminuição no álbum seguinte 1968. Esse choque no perfil dos dois álbuns requer uma explicação que não está nos dados.
O álbum de 1967 A Braza do Norte, primeiro gravado na gravadora Cantagalo marca também um momento especial na vida do cantor, pois esse é o ano de seu divórcio de Almira, parceira na música e esposa. De músicas mais longas e de faixas de letra mais triste como “Passarinho abandonado” composta pelo próprio Jackson, esse perfil incomum do álbum acaba por refletir esse momento da vida do cantor.
O álbum de 1968, por outro lado, é uma coletânea e reúne algumas das músicas até então mais populares (e também curtas) do cantor. Essa coletânea casa com um inaudito, porém curto, hiato do cantor, que pela primeira vez passa um ano sem publicar um álbum, o qual pode ser atribuído ao seu divórcio.
# lollipop chart
# Album popularity per album: a segment from 0 to the popularity value with a
# point at its end, albums ordered by popularity on the y axis.

# Wide left margin for the plotly layout (presumably to fit the album names
# on the y axis -- confirm).
m <- list(l = 370)

p <- data %>%
  ggplot(aes(
    x = album_popularity,
    y = reorder(album_name, album_popularity),
    group = remastered
  )) +
  geom_segment(aes(
    x = 0,
    y = reorder(album_name, album_popularity),
    xend = album_popularity,
    yend = album_name
  )) +
  geom_point(aes(color = remastered)) +
  theme(axis.title.y = element_blank()) +
  scale_color_discrete(" Remastered")

# NOTE(review): `tooltip = NA` is presumably meant to disable hover
# tooltips -- confirm against the ggplotly documentation.
# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned.
ggplotly(p, tooltip = NA) %>%
  layout(autosize = FALSE, margin = m)
Fica evidente que para Jackson do Pandeiro os álbuns remasterizados/relançados dominam o cenário do Spotify em termos de popularidade. Primeiramente, simplesmente observando a proporção entre remasterizados e não remasterizados fica evidente que os primeiros são os que o Spotify mais disponibiliza. Segundo, levando em consideração a questão de qualidade de áudio era de se esperar que os remasterizados tomassem a dianteira.
# Twelve-colour palette, one colour per musical key value.
tim12equal <- c(
  "#00008F", "#0000EA", "#0047FF", "#00A2FF", "#00FEFF", "#5AFFA5",
  "#B5FF4A", "#FFED00", "#FF9200", "#FF3700", "#DB0000", "#800000"
)

# Left and right margins handed to the plotly layout below.
m <- list(l = 45, r = 40)

# For every album release date, compute the share of tracks in each key and
# draw it as a stacked column; remastered and original albums get separate
# facet rows.
p <- data %>%
  select(key, album_release_date, remastered) %>%
  group_by(album_release_date, key, remastered) %>%
  summarise(counted = n()) %>%
  ungroup() %>%
  # Relative frequency is taken within each release date.
  group_by(album_release_date) %>%
  mutate(rel_freq = counted / sum(counted)) %>%
  # Drop the grouping so it does not leak into the plotting step.
  ungroup() %>%
  ggplot(aes(
    x = factor(album_release_date),
    y = rel_freq,
    fill = factor(key),
    # Tooltip text picked up by ggplotly(tooltip = "text") below.
    text = paste(
      "Proportion:",
      round(rel_freq * 100, 2), "%"
    )
  )) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = tim12equal) +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  facet_wrap(remastered ~ ., nrow = 2) +
  ggtitle("Notas musicais (distinção por remasterização)") +
  guides(fill = guide_legend(title = "Musical\nNotes"))

# The original chained two layout() calls that set the legend orientation to
# "h" and then immediately to "v"; only the later value takes effect, so the
# effective settings (vertical legend, y = -0.2) are stated once here.
# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned.
ggplotly(p, tooltip = "text") %>%
  layout(
    autosize = FALSE,
    margin = m,
    legend = list(orientation = "v", y = -0.2)
  )